Taek_Blog - Opendata

Import & Write

1 오픈데이터 분석 실습 : Import/Write Data

1.1 패키지 불러오기

tidyverse를 불러오면 read_csv() 함수를 제공하는 readr 패키지가 함께 로드됨

library(tidyverse)

2 Import Data

2.1 read_csv

read_csv는 문자열을 factor로 자동 변환하지 않음 (read.csv는 R 4.0 이전까지 stringsAsFactors = TRUE가 기본값이었고, R 4.0부터는 기본값이 FALSE)

### Check the current working directory
getwd()

[1] "G:/내 드라이브/taek_blog/posts/Opendata_Analysis Ch3"

### Set the working directory (call left commented out)
#setwd()

### Load the CSV file; the bare file name is resolved against the
### current working directory
heights <- read_csv("heights.csv")
heights

# A tibble: 1,192 × 6
    earn height sex       ed   age race    
   <dbl>  <dbl> <chr>  <dbl> <dbl> <chr>   
 1 50000   74.4 male      16    45 white   
 2 60000   65.5 female    16    58 white   
 3 30000   63.6 female    16    29 white   
 4 50000   63.1 female    16    91 other   
 5 51000   63.4 female    17    39 white   
 6  9000   64.4 female    15    26 white   
 7 29000   61.7 female    12    49 white   
 8 32000   72.7 male      17    46 white   
 9  2000   72.0 male      15    21 hispanic
10 27000   72.2 male      12    26 white   
# … with 1,182 more rows

### Use an absolute path
### NOTE(review): this path is specific to the author's machine; adjust it
### before running elsewhere
heights <- read_csv("C:/Users/seong taek/Desktop/3-1 Opendata_Analysis/opendata/heights.csv")
heights

# A tibble: 1,192 × 6
    earn height sex       ed   age race    
   <dbl>  <dbl> <chr>  <dbl> <dbl> <chr>   
 1 50000   74.4 male      16    45 white   
 2 60000   65.5 female    16    58 white   
 3 30000   63.6 female    16    29 white   
 4 50000   63.1 female    16    91 other   
 5 51000   63.4 female    17    39 white   
 6  9000   64.4 female    15    26 white   
 7 29000   61.7 female    12    49 white   
 8 32000   72.7 male      17    46 white   
 9  2000   72.0 male      15    21 hispanic
10 27000   72.2 male      12    26 white   
# … with 1,182 more rows

### Build a tibble from literal CSV text; the first line supplies the
### column names
read_csv("a,b,c
         1,2,3
         4,5,6")

# A tibble: 2 × 3
      a     b     c
  <dbl> <dbl> <dbl>
1     1     2     3
2     4     5     6

### Skip lines: skip = 2 drops the two metadata lines so that "x,y,z"
### is read as the header
read_csv("The first line of metadata
         The second line of metadata
         x,y,z
         1,2,3" , skip=2)

# A tibble: 1 × 3
      x     y     z
  <dbl> <dbl> <dbl>
1     1     2     3

### Skip comments: text after the comment marker "#" is ignored
read_csv("#A comment I want to skip
         x,y,z
         1,2,3", comment="#")

# A tibble: 1 × 3
      x     y     z
  <dbl> <dbl> <dbl>
1     1     2     3

### Data only, no header row: col_names = FALSE makes read_csv generate
### the names X1, X2, X3 instead of consuming the first line as a header.
### FALSE is spelled out because F is an ordinary variable that can be
### reassigned.
read_csv("1,2,3
         4,5,6", col_names = FALSE)

# A tibble: 2 × 3
     X1    X2    X3
  <dbl> <dbl> <dbl>
1     1     2     3
2     4     5     6

### "\n" is the newline character: it separates the rows inside a
### single-line string. Every row has exactly three fields (no trailing
### comma), so both rows match the three generated columns.
read_csv("1,2,3 \n 4,5,6", col_names = FALSE)

# A tibble: 2 × 3
     X1    X2    X3
  <dbl> <dbl> <dbl>
1     1     2     3
2     4     5     6

### Supply the column names explicitly; each row has exactly three fields
### (no trailing comma) to match the three names.
read_csv("1,2,3 \n 4,5,6", col_names = c("A", "B", "C"))

# A tibble: 2 × 3
      A     B     C
  <dbl> <dbl> <dbl>
1     1     2     3
2     4     5     6

### Assign NA: fields equal to "." are read as missing values
read_csv("a,b,c \n 1,2,.", na=".")

# A tibble: 1 × 3
      a     b c    
  <dbl> <dbl> <lgl>
1     1     2 NA

2.2 Locale 설정/확인

### Show the current locale settings
Sys.getlocale()

[1] "LC_COLLATE=Korean_Korea.utf8;LC_CTYPE=ko_KR.UTF-8;LC_MONETARY=Korean_Korea.utf8;LC_NUMERIC=C;LC_TIME=Korean_Korea.utf8"

### Switch the locale to English (call left commented out)
#Sys.setlocale("LC_ALL", "English")

### Reset every locale category to "C" (call left commented out)
#Sys.setlocale("LC_ALL", "C")

2.3 한글 파일 읽기

### Guess the file's encoding
guess_encoding("exercise.csv")

# A tibble: 2 × 2
  encoding   confidence
  <chr>           <dbl>
1 EUC-KR           1   
2 IBM420_ltr       0.25

### Resolve the read error by passing the encoding explicitly
exercise <- read_csv("exercise.csv", locale = locale(encoding = "EUC-KR"))
exercise

# A tibble: 5 × 2
  이름  선호도
  <chr>  <dbl>
1 하민       5
2 하준       4
3 하진       4
4 태산       3
5 태민       2

### Alternative: open the CSV file beforehand and change its encoding,
### then read the re-saved copy without a locale argument
exercise <- read_csv("exercise_utf_8.csv")
exercise

# A tibble: 5 × 2
  이름  선호도
  <chr>  <dbl>
1 하민       5
2 하준       4
3 하진       4
4 태산       3
5 태민       2

### Guess the encoding of the UTF-8 copy
guess_encoding("exercise_utf_8.csv")

# A tibble: 3 × 2
  encoding     confidence
  <chr>             <dbl>
1 UTF-8              1   
2 windows-1255       0.38
3 windows-1255       0.29

3 Write Data

3.1 파일 저장/삭제

### Load the heights data from the working directory
heights <- read_csv("heights.csv")
heights

# A tibble: 1,192 × 6
    earn height sex       ed   age race    
   <dbl>  <dbl> <chr>  <dbl> <dbl> <chr>   
 1 50000   74.4 male      16    45 white   
 2 60000   65.5 female    16    58 white   
 3 30000   63.6 female    16    29 white   
 4 50000   63.1 female    16    91 other   
 5 51000   63.4 female    17    39 white   
 6  9000   64.4 female    15    26 white   
 7 29000   61.7 female    12    49 white   
 8 32000   72.7 male      17    46 white   
 9  2000   72.0 male      15    21 hispanic
10 27000   72.2 male      12    26 white   
# … with 1,182 more rows

### Save a CSV file in the current working directory
write_csv(heights, "만들 파일 이름.csv")

### RDS format: write the tibble to an .rds file, then read it back
write_rds(heights, "만들 파일 이름.rds")
read_rds("만들 파일 이름.rds")

# A tibble: 1,192 × 6
    earn height sex       ed   age race    
   <dbl>  <dbl> <chr>  <dbl> <dbl> <chr>   
 1 50000   74.4 male      16    45 white   
 2 60000   65.5 female    16    58 white   
 3 30000   63.6 female    16    29 white   
 4 50000   63.1 female    16    91 other   
 5 51000   63.4 female    17    39 white   
 6  9000   64.4 female    15    26 white   
 7 29000   61.7 female    12    49 white   
 8 32000   72.7 male      17    46 white   
 9  2000   72.0 male      15    21 hispanic
10 27000   72.2 male      12    26 white   
# … with 1,182 more rows

### Delete the CSV file written earlier
file.remove("만들 파일 이름.csv")

[1] TRUE

3.2 feather 패키지

### Install the feather package once if needed (call left commented out)
#install.packages("feather")
library(feather)

### Write the tibble to a feather file, then read it back
write_feather(heights, "heights.feather")
read_feather("heights.feather")

# A tibble: 1,192 × 6
    earn height sex       ed   age race    
   <dbl>  <dbl> <chr>  <dbl> <dbl> <chr>   
 1 50000   74.4 male      16    45 white   
 2 60000   65.5 female    16    58 white   
 3 30000   63.6 female    16    29 white   
 4 50000   63.1 female    16    91 other   
 5 51000   63.4 female    17    39 white   
 6  9000   64.4 female    15    26 white   
 7 29000   61.7 female    12    49 white   
 8 32000   72.7 male      17    46 white   
 9  2000   72.0 male      15    21 hispanic
10 27000   72.2 male      12    26 white   
# … with 1,182 more rows